InĀ [1]:
# Step-by-Step Process for Your Jupyter Notebook
# 1. Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
InĀ [2]:
# 2. Load both datasets
df_vehicles = pd.read_csv('Electric_Vehicle_Population_Data.csv')
df_population_history = pd.read_csv('Electric_Vehicle_Population_Size_History_By_County.csv')
# Display first few rows to understand the structure
df_vehicles.head(), df_population_history.head()
Out[2]:
( VIN (1-10) County City State Postal Code Model Year Make \
0 5UXTA6C0XM Kitsap Seabeck WA 98380.0 2021 BMW
1 5YJ3E1EB1J Kitsap Poulsbo WA 98370.0 2018 TESLA
2 WP0AD2A73G Snohomish Bothell WA 98012.0 2016 PORSCHE
3 5YJ3E1EB5J Kitsap Bremerton WA 98310.0 2018 TESLA
4 1N4AZ1CP3K King Redmond WA 98052.0 2019 NISSAN
Model Electric Vehicle Type \
0 X5 Plug-in Hybrid Electric Vehicle (PHEV)
1 MODEL 3 Battery Electric Vehicle (BEV)
2 PANAMERA Plug-in Hybrid Electric Vehicle (PHEV)
3 MODEL 3 Battery Electric Vehicle (BEV)
4 LEAF Battery Electric Vehicle (BEV)
Clean Alternative Fuel Vehicle (CAFV) Eligibility Electric Range \
0 Clean Alternative Fuel Vehicle Eligible 30.0
1 Clean Alternative Fuel Vehicle Eligible 215.0
2 Not eligible due to low battery range 15.0
3 Clean Alternative Fuel Vehicle Eligible 215.0
4 Clean Alternative Fuel Vehicle Eligible 150.0
Base MSRP Legislative District DOL Vehicle ID \
0 0.0 35.0 267929112
1 0.0 23.0 475911439
2 0.0 1.0 101971278
3 0.0 23.0 474363746
4 0.0 45.0 476346482
Vehicle Location \
0 POINT (-122.8728334 47.5798304)
1 POINT (-122.6368884 47.7469547)
2 POINT (-122.206146 47.839957)
3 POINT (-122.6231895 47.5930874)
4 POINT (-122.13158 47.67858)
Electric Utility 2020 Census Tract
0 PUGET SOUND ENERGY INC 5.303509e+10
1 PUGET SOUND ENERGY INC 5.303509e+10
2 PUGET SOUND ENERGY INC 5.306105e+10
3 PUGET SOUND ENERGY INC 5.303508e+10
4 PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) 5.303303e+10 ,
Date County State Vehicle Primary Use \
0 June 30 2024 Caddo LA Passenger
1 December 31 2017 Pend Oreille WA Truck
2 April 30 2017 Lincoln WA Truck
3 February 28 2022 El Paso CO Passenger
4 January 31 2017 DeKalb GA Passenger
Battery Electric Vehicles (BEVs) Plug-In Hybrid Electric Vehicles (PHEVs) \
0 2 0
1 0 0
2 0 0
3 2 2
4 1 0
Electric Vehicle (EV) Total Non-Electric Vehicle Total Total Vehicles \
0 2 10 12
1 0 5619 5619
2 0 4464 4464
3 4 663 667
4 1 92 93
Percent Electric Vehicles
0 16.67
1 0.00
2 0.00
3 0.60
4 1.08 )
InĀ [3]:
# 3. Initial Data Inspection
# Check for data types, missing values, and basic information
df_vehicles.info()
df_population_history.info()
# Check for missing or NaN values in both datasets
df_vehicles.isnull().sum(), df_population_history.isnull().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 210165 entries, 0 to 210164 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 VIN (1-10) 210165 non-null object 1 County 210161 non-null object 2 City 210161 non-null object 3 State 210165 non-null object 4 Postal Code 210161 non-null float64 5 Model Year 210165 non-null int64 6 Make 210165 non-null object 7 Model 210165 non-null object 8 Electric Vehicle Type 210165 non-null object 9 Clean Alternative Fuel Vehicle (CAFV) Eligibility 210165 non-null object 10 Electric Range 210160 non-null float64 11 Base MSRP 210160 non-null float64 12 Legislative District 209720 non-null float64 13 DOL Vehicle ID 210165 non-null int64 14 Vehicle Location 210155 non-null object 15 Electric Utility 210161 non-null object 16 2020 Census Tract 210161 non-null float64 dtypes: float64(5), int64(2), object(10) memory usage: 27.3+ MB <class 'pandas.core.frame.DataFrame'> RangeIndex: 23439 entries, 0 to 23438 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 23439 non-null object 1 County 23346 non-null object 2 State 23346 non-null object 3 Vehicle Primary Use 23439 non-null object 4 Battery Electric Vehicles (BEVs) 23439 non-null int64 5 Plug-In Hybrid Electric Vehicles (PHEVs) 23439 non-null int64 6 Electric Vehicle (EV) Total 23439 non-null int64 7 Non-Electric Vehicle Total 23439 non-null int64 8 Total Vehicles 23439 non-null int64 9 Percent Electric Vehicles 23439 non-null float64 dtypes: float64(1), int64(5), object(4) memory usage: 1.8+ MB
Out[3]:
(VIN (1-10) 0 County 4 City 4 State 0 Postal Code 4 Model Year 0 Make 0 Model 0 Electric Vehicle Type 0 Clean Alternative Fuel Vehicle (CAFV) Eligibility 0 Electric Range 5 Base MSRP 5 Legislative District 445 DOL Vehicle ID 0 Vehicle Location 10 Electric Utility 4 2020 Census Tract 4 dtype: int64, Date 0 County 93 State 93 Vehicle Primary Use 0 Battery Electric Vehicles (BEVs) 0 Plug-In Hybrid Electric Vehicles (PHEVs) 0 Electric Vehicle (EV) Total 0 Non-Electric Vehicle Total 0 Total Vehicles 0 Percent Electric Vehicles 0 dtype: int64)
InĀ [4]:
# 4. Data Cleaning
# 4.1 Handling Missing Values
# Drop rows with critical missing values in 'VIN', 'Make', 'Model', etc.
df_vehicles_cleaned = df_vehicles.dropna(subset=['VIN (1-10)', 'Make', 'Model', 'County'])
# Fill or drop NaN values in 'Electric Range' and 'Base MSRP'
df_vehicles_cleaned['Electric Range'].fillna(df_vehicles_cleaned['Electric Range'].mean(), inplace=True)
df_vehicles_cleaned['Base MSRP'].fillna(df_vehicles_cleaned['Base MSRP'].mean(), inplace=True)
# Verify if missing values are handled
df_vehicles_cleaned.isnull().sum()
C:\Users\Sukhpreet\AppData\Local\Temp\ipykernel_100340\1211376461.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df_vehicles_cleaned['Electric Range'].fillna(df_vehicles_cleaned['Electric Range'].mean(), inplace=True)
C:\Users\Sukhpreet\AppData\Local\Temp\ipykernel_100340\1211376461.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_vehicles_cleaned['Electric Range'].fillna(df_vehicles_cleaned['Electric Range'].mean(), inplace=True)
C:\Users\Sukhpreet\AppData\Local\Temp\ipykernel_100340\1211376461.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df_vehicles_cleaned['Base MSRP'].fillna(df_vehicles_cleaned['Base MSRP'].mean(), inplace=True)
C:\Users\Sukhpreet\AppData\Local\Temp\ipykernel_100340\1211376461.py:9: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_vehicles_cleaned['Base MSRP'].fillna(df_vehicles_cleaned['Base MSRP'].mean(), inplace=True)
Out[4]:
VIN (1-10) 0 County 0 City 0 State 0 Postal Code 0 Model Year 0 Make 0 Model 0 Electric Vehicle Type 0 Clean Alternative Fuel Vehicle (CAFV) Eligibility 0 Electric Range 0 Base MSRP 0 Legislative District 441 DOL Vehicle ID 0 Vehicle Location 6 Electric Utility 0 2020 Census Tract 0 dtype: int64
InĀ [5]:
# 4.2 Handling Outliers
# Check for outliers in 'Electric Range' and 'Base MSRP'
sns.boxplot(x=df_vehicles_cleaned['Electric Range'])
plt.show()
# Remove extreme outliers in 'Electric Range'
df_vehicles_cleaned = df_vehicles_cleaned[df_vehicles_cleaned['Electric Range'] < 500]
InĀ [6]:
# 5. Create Index and Use loc/iloc
# Create a new index (use VIN or another field as the index)
df_vehicles_cleaned.set_index('VIN (1-10)', inplace=True)
# Use loc/iloc to access specific rows and columns
# Access data for a specific vehicle by VIN
df_vehicles_cleaned.loc['5YJ3E1EB1J']
# Access first 5 rows
df_vehicles_cleaned.iloc[0:5]
Out[6]:
| County | City | State | Postal Code | Model Year | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | Vehicle Location | Electric Utility | 2020 Census Tract | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| VIN (1-10) | ||||||||||||||||
| 5UXTA6C0XM | Kitsap | Seabeck | WA | 98380.0 | 2021 | BMW | X5 | Plug-in Hybrid Electric Vehicle (PHEV) | Clean Alternative Fuel Vehicle Eligible | 30.0 | 0.0 | 35.0 | 267929112 | POINT (-122.8728334 47.5798304) | PUGET SOUND ENERGY INC | 5.303509e+10 |
| 5YJ3E1EB1J | Kitsap | Poulsbo | WA | 98370.0 | 2018 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 215.0 | 0.0 | 23.0 | 475911439 | POINT (-122.6368884 47.7469547) | PUGET SOUND ENERGY INC | 5.303509e+10 |
| WP0AD2A73G | Snohomish | Bothell | WA | 98012.0 | 2016 | PORSCHE | PANAMERA | Plug-in Hybrid Electric Vehicle (PHEV) | Not eligible due to low battery range | 15.0 | 0.0 | 1.0 | 101971278 | POINT (-122.206146 47.839957) | PUGET SOUND ENERGY INC | 5.306105e+10 |
| 5YJ3E1EB5J | Kitsap | Bremerton | WA | 98310.0 | 2018 | TESLA | MODEL 3 | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 215.0 | 0.0 | 23.0 | 474363746 | POINT (-122.6231895 47.5930874) | PUGET SOUND ENERGY INC | 5.303508e+10 |
| 1N4AZ1CP3K | King | Redmond | WA | 98052.0 | 2019 | NISSAN | LEAF | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 150.0 | 0.0 | 45.0 | 476346482 | POINT (-122.13158 47.67858) | PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) | 5.303303e+10 |
InĀ [36]:
# 6. Rearrange/Count Columns
# Rearrange columns to focus on key data
df_vehicles_cleaned = df_vehicles_cleaned[['Make', 'Model', 'Electric Vehicle Type', 'Electric Range', 'Base MSRP', 'County']]
# Count occurrences of electric vehicle types
ev_type_counts = df_vehicles_cleaned['Electric Vehicle Type'].value_counts()
print(ev_type_counts)
Electric Vehicle Type Battery Electric Vehicle (BEV) 165552 Plug-in Hybrid Electric Vehicle (PHEV) 44609 Name: count, dtype: int64
InĀ [16]:
# 7. Summary Statistics (Max/Min/Median/Describe)
# Basic descriptive statistics
df_vehicles_cleaned.describe()
# Max and Min values
max_range = df_vehicles_cleaned['Electric Range'].max()
min_range = df_vehicles_cleaned['Electric Range'].min()
median_range = df_vehicles_cleaned['Electric Range'].median()
print(f"Max Electric Range: {max_range}, Min Electric Range: {min_range}, Median Electric Range: {median_range}")
Max Electric Range: 337.0, Min Electric Range: 0.0, Median Electric Range: 0.0
InĀ [18]:
# 8. Visualization: Explore the Data
# 8.1 Electric Vehicle Range Distribution
# Plot the distribution of Electric Range
sns.histplot(df_vehicles_cleaned['Electric Range'], kde=True)
plt.title('Distribution of Electric Vehicle Range')
plt.xlabel('Electric Range')
plt.ylabel('Frequency')
plt.show()
InĀ [60]:
# 8.2 Count of Electric Vehicles by CIties
# Count the number of electric vehicles by Cities
county_counts = df_vehicles_cleaned['County'].value_counts()
# Plot the top counties with the most electric vehicles
county_counts.head(10).plot(kind='bar', figsize=(10,6), color='skyblue')
plt.title('Top 10 Cities by Electric Vehicle Registration')
plt.xlabel('County')
plt.ylabel('Number of Vehicles')
plt.xticks(rotation=0)
plt.show()
InĀ [38]:
# 8.3 Bubble Chart of Electric Range vs MSRP
# Bubble chart showing Electric Range vs MSRP for different vehicle types
fig = px.scatter(df_vehicles_cleaned, x="Electric Range", y="Base MSRP", color="Electric Vehicle Type",
size="Electric Range", hover_name="Model", size_max=20, title="Electric Range vs MSRP")
fig.show()
InĀ [42]:
# Convert 'Date' column to datetime format with automatic inference
df_population_history['Date'] = pd.to_datetime(df_population_history['Date'], errors='coerce')
# Check the first few rows to confirm the conversion
print(df_population_history['Date'].head())
0 2024-06-30 1 2017-12-31 2 2017-04-30 3 2022-02-28 4 2017-01-31 Name: Date, dtype: datetime64[ns]
InĀ [46]:
# Check the unique values in the 'Date' column
print(df_population_history['Date'].unique())
<DatetimeArray> ['2024-06-30 00:00:00', '2017-12-31 00:00:00', '2017-04-30 00:00:00', '2022-02-28 00:00:00', '2017-01-31 00:00:00', '2019-07-31 00:00:00', '2019-03-31 00:00:00', '2024-07-31 00:00:00', '2021-03-31 00:00:00', '2022-08-31 00:00:00', '2024-04-30 00:00:00', '2020-06-30 00:00:00', '2018-05-31 00:00:00', '2022-04-30 00:00:00', '2020-04-30 00:00:00', '2017-11-30 00:00:00', '2024-02-29 00:00:00', '2019-08-31 00:00:00', '2023-07-31 00:00:00', '2022-03-31 00:00:00', '2020-11-30 00:00:00', '2019-06-30 00:00:00', '2019-10-31 00:00:00', '2019-12-31 00:00:00', '2024-05-31 00:00:00', '2022-06-30 00:00:00', '2021-05-31 00:00:00', '2022-10-31 00:00:00', '2023-02-28 00:00:00', '2023-01-31 00:00:00', '2023-06-30 00:00:00', '2022-07-31 00:00:00', '2024-01-31 00:00:00', '2022-11-30 00:00:00', '2023-11-30 00:00:00', '2023-04-30 00:00:00', '2018-11-30 00:00:00', '2021-12-31 00:00:00', '2019-02-28 00:00:00', '2018-10-31 00:00:00', '2020-08-31 00:00:00', '2017-09-30 00:00:00', '2022-01-31 00:00:00', '2022-05-31 00:00:00', '2021-01-31 00:00:00', '2024-03-31 00:00:00', '2018-03-31 00:00:00', '2020-12-31 00:00:00', '2018-08-31 00:00:00', '2023-10-31 00:00:00', '2020-07-31 00:00:00', '2020-03-31 00:00:00', '2020-09-30 00:00:00', '2020-02-29 00:00:00', '2020-01-31 00:00:00', '2018-04-30 00:00:00', '2017-08-31 00:00:00', '2021-10-31 00:00:00', '2018-02-28 00:00:00', '2023-05-31 00:00:00', '2024-09-30 00:00:00', '2020-05-31 00:00:00', '2018-06-30 00:00:00', '2023-12-31 00:00:00', '2023-08-31 00:00:00', '2017-06-30 00:00:00', '2018-01-31 00:00:00', '2021-09-30 00:00:00', '2021-02-28 00:00:00', '2024-08-31 00:00:00', '2017-05-31 00:00:00', '2022-09-30 00:00:00', '2023-09-30 00:00:00', '2021-11-30 00:00:00', '2018-09-30 00:00:00', '2017-02-28 00:00:00', '2019-01-31 00:00:00', '2023-03-31 00:00:00', '2018-12-31 00:00:00', '2019-09-30 00:00:00', '2021-06-30 00:00:00', '2022-12-31 00:00:00', '2020-10-31 00:00:00', '2017-07-31 00:00:00', '2018-07-31 00:00:00', '2019-05-31 00:00:00', '2021-07-31 00:00:00', '2021-08-31 00:00:00', '2017-10-31 00:00:00', '2019-04-30 00:00:00', '2021-04-30 00:00:00', '2017-03-31 00:00:00', '2019-11-30 00:00:00'] Length: 93, dtype: datetime64[ns]
InĀ [50]:
# 9. Further Insights from Population History Dataset
# 9.1 Electric Vehicle Growth Over Time (Line Plot)
# Plot Electric Vehicle growth over time
plt.figure(figsize=(12, 6))
df_population_history.groupby('Date')['Electric Vehicle (EV) Total'].sum().plot(kind='line', color='green')
plt.title('Electric Vehicle Population Growth Over Time')
plt.xlabel('Year')
plt.ylabel('Total Electric Vehicles')
plt.show()
InĀ [54]:
# 9.2 County-Level Electric Vehicle Percentage
# Create a plot of the percent of electric vehicles by county
county_ev_percent = df_population_history.groupby('County')['Percent Electric Vehicles'].mean().sort_values(ascending=False)
county_ev_percent.head(10).plot(kind='bar', figsize=(10,6), color='orange')
plt.title('Top 10 Counties by Percentage of Electric Vehicles')
plt.xlabel('County')
plt.ylabel('Percent of Electric Vehicles')
plt.xticks(rotation=0)
plt.show()